library(ggplot2)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggmap)
library(mapdata)
## Loading required package: maps
##
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
##
## ozone
library(maps)
library(stringr)
library(viridis)
## Loading required package: viridisLite
library(maptools)
## Loading required package: sp
## Checking rgeos availability: FALSE
## Note: when rgeos is not available, polygon geometry computations in maptools depend on gpclib,
## which has a restricted licence. It is disabled by default;
## to enable gpclib, type gpclibPermit()
library(gpclib)
## General Polygon Clipper Library for R (version 1.5-5)
## Type 'class ? gpc.poly' for help
library(sp)
# Opt in to gpclib: maptools reports that rgeos is unavailable, in which case its
# polygon geometry computations depend on gpclib, which is disabled by default
# because of its restricted licence.
gpclibPermit()
## Warning in gpclibPermit(): support for gpclib will be withdrawn from
## maptools at the next major release
## [1] TRUE
Our first step is to import the data into our environment.
# Load the cleaned air-quality data: comma-separated, first row holds column names.
eda <- read.table(
  file = "data/cleaned_AQ.csv",
  sep = ",",
  header = TRUE
)
After importing the data, we keep only the rows that measure Ozone and compute the overall mean of their arithmetic_mean values. We then normalize each arithmetic_mean by dividing it by that overall mean.
# Keep only the ozone rows.
eda_ozone <- filter(eda, parameter_name == "Ozone")
# Overall mean of the ozone arithmetic means, used as the normalizing constant.
avg_mean <- mean(eda_ozone[["arithmetic_mean"]])
# Each observation expressed as a multiple of the overall mean.
eda_ozone[["norm_mean"]] <- eda_ozone[["arithmetic_mean"]] / avg_mean
To narrow our analysis, we filter first by state and then find the counties which have the greatest number of data points.
# Restrict the ozone data to Michigan.
mi_ozone <- filter(eda_ozone, state_name == "Michigan")

# Number of observations per county, largest count first.
# NOTE: the name `list` shadows base::list; it is kept because later code reads
# `list$county_name`.
list <- arrange(
  summarize(group_by(mi_ozone, county_name), n = n()),
  desc(n)
)
We create another data frame that just has the three counties with the greatest number of data points.
# Keep only the three counties with the most ozone observations.
# `list` is sorted by descending count, so its first three rows are the top
# counties; head() simply returns fewer names when fewer than three counties
# exist instead of indexing past the end.
tricounty_ozone <- mi_ozone %>%
  filter(county_name %in% head(list$county_name, 3))
A basic plot of means over time does not show any useful correlation.
# Scatter plot of the normalized ozone means against year for the three counties.
ggplot(tricounty_ozone, aes(x = year, y = norm_mean)) +
  geom_point()
Color coding the values by county also does not let us see any correlation.
# Same scatter plot, with points coloured by county.
ggplot(tricounty_ozone, aes(x = year, y = norm_mean, color = county_name)) +
  geom_point()
In the next graph, we color the points by the metric used to see whether that reveals any difference.
# Same scatter plot, with points coloured by the metric that produced each value.
ggplot(tricounty_ozone, aes(x = year, y = norm_mean, color = metric_used)) +
  geom_point()
The metric used does seem to correspond to different types of data. Specifically, the red points are associated with the daily maxima of observed hourly values rather than the daily maximum of an average. We will remove the daily maxima from our evaluation.
# Label of the metric that is excluded from the remaining plots.
odd_metric <- "Daily maxima of observed hourly values (between 9:00 AM and 8:00 PM)"
# Metric-coloured scatter plot without the rows recorded under that metric.
tricounty_ozone %>%
  filter(metric_used != odd_metric) %>%
  ggplot(aes(x = year, y = norm_mean, color = metric_used)) +
  geom_point()
Since those values blend together, we will assume they are roughly equivalent.
# County-coloured scatter plot, excluding the rows recorded under `odd_metric`.
ggplot(
  filter(tricounty_ozone, metric_used != odd_metric),
  aes(x = year, y = norm_mean, color = county_name)
) +
  geom_point()
It appears in the graph above that there are multiple values for one county in the same year.
# One panel per county, points coloured by address, excluding the rows
# recorded under `odd_metric`.
ggplot(
  filter(tricounty_ozone, metric_used != odd_metric),
  aes(x = year, y = norm_mean, color = address)
) +
  geom_point() +
  facet_wrap(~ county_name)
Next, let’s place these measurements on a map of Michigan to see the locations where the Ozone values (including the maxima observed during that particular part of the day) were measured when the pollutant data was collected.
# Fetch a satellite basemap centred on the mean longitude/latitude of the
# Michigan ozone records.
map_MI <- get_map(
  location = with(mi_ozone, c(lon = mean(longitude), lat = mean(latitude))),
  maptype = "satellite",
  zoom = 6,
  scale = 2
)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=43.03394,-84.534692&zoom=6&size=640x640&scale=2&maptype=satellite&language=en-EN&sensor=false
# Draw every Michigan ozone record at its coordinates on the basemap, with the
# point outline coloured by county.
# alpha is a fixed appearance setting, so it is passed outside aes(): inside
# aes() the constant 0.7 was treated as a data value, rescaled by the alpha
# scale, and given its own legend (which then had to be hidden via guides()).
# With nothing mapped to fill, alpha or size, no guides() call is needed.
ggmap(map_MI) +
  geom_point(
    data = mi_ozone,
    aes(x = longitude, y = latitude, colour = county_name),
    alpha = 0.7,
    size = 5,
    shape = 21
  )
## Warning: Removed 8 rows containing missing values (geom_point).
# ggmap(map_MI) +
# geom_point(data = mi_ozone, aes(x = longitude, y = latitude, colour = county_name, alpha = 0.7), size = 5, shape = 21) + guides(fill=guide_legend(reverse = TRUE, alpha=FALSE, size=TRUE))
#
#
# # guides(fill = guide_legend(reverse = TRUE, override.aes =
# # list(alpha = 1)))
Now let’s visualize the distribution of the Ozone measurements for these three counties in particular on a map of Michigan.
# Colour basemap of Michigan (legend placed top-left) used as the canvas for
# the three-county overlay.
ThreeCountyMap <- qmap(
  location = "michigan",
  zoom = 7,
  legend = "topleft",
  color = "color"
)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=michigan&zoom=7&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=michigan&sensor=false
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
# Overlay 2-D bins of the three-county ozone records, outlined and filled by county.
ThreeCountyMap +
  stat_bin2d(
    mapping = aes(
      x = longitude,
      y = latitude,
      colour = county_name,
      fill = county_name
    ),
    data = tricounty_ozone,
    bins = 30,
    size = .5,
    alpha = 1 / 2
  )